Load and transpose counts file (Sample IDs as header and miRNA IDs as first column)
oriCountsFile <- read.csv(file = paste(sep="", data_path,'/mature_counts_all.csv'))
countsFile <- as.data.frame(t(oriCountsFile)) # transpose oriCountsFile
countsFile <- janitor::row_to_names(countsFile, row_number=1)
countsFile <- tibble::rownames_to_column(countsFile, var="miR")
countsFile$miR <- chartr('.', '-', countsFile$miR) # replace '.' with '-' in miR
countsFile
Fetch the mirbase22 IDs and sequences
mrbse <- read.table(mirbaseFile, header = T, sep = "\t")
mrbse <- mrbse[-c(1,2),] # The first two rows don't match up with mirbase, are the only duplicates, and are the only sequences with T's for some reason. I'll remove them.
dim(mrbse)
[1] 2656 2
head(mrbse)
Create microRNA Mapping table and Update Countfile miRNA Row Names
mrbse2 <- merge(mrbse, update, by.x = 1, by.y = 2, all.x = F, all.y = T)
mrbse2 <- mrbse2[,-4]
dim(mrbse2)
[1] 193 3
head(mrbse2)
rownames(mrbse2)<- mrbse2[,"OriginalName"]
rownames(countsFile)<-countsFile[,1]
rownames(countsFile)<- mrbse2[rownames(countsFile), "Name"]
Create Row for each miRNA and Merge this with the data
mergedData <- matrix(0, nrow = dim(mrbse)[1], ncol= 302)
mergedData <- data.frame(mergedData)
mergedData<- cbind(mrbse, mergedData)
colnames(mergedData)<- c("Name", "Seq", colnames(countsFile)[-1])
rownames(mergedData)<-mergeData[,1]
mergedData[rownames(countsFile), colnames(countsFile)[-1]]<- countsFile[,-1]
#mergedData <- merge(mrbse2, countsFile, by.x = 3, by.y = 1, all.x = T, all.y = F)
dim(mergedData)
[1] 2656 304
head(mergedData)
#mergedData <- mergedData[,-1] # Remove the original (old) IDs
mergedData[1:5,1:5]
mergedData
NA
For each column: assign each miRNA a value 1-5 depending on which of the 20th percentiles it falls into. If the value is 0, it remains a 0
# Given a column, assign a number to each element from 0-5. All
# 0s get a 0, and the rest get a value according to the 20th percentile
# that it falls in among the non-zero values.
quintize <- function(vec) {
qntls <- c(0, quantile(vec[which(vec != 0)], 0.2*(1:4)))
vec2 <- sapply(vec, function(x) {
if (x == 0) return(0)
else return(max(which(x > qntls)))
})
return(vec2)
}
mergedData[,3:ncol(mergedData)] = lapply(mergedData[,3:ncol(mergedData)], FUN = function(y){as.numeric(y)})
dim(mergedData)
[1] 2656 304
quintizedData <- apply(mergedData[,-c(1:2)], 2, quintize)
quintizedData <- cbind(mergedData[,1:2], quintizedData)
quintizedData[1:5,1:5]
dim(quintizedData)
[1] 2656 304
Get the meta information
meta <- read.table(paste(sep="", data_path,"/meta_celltype_tissue_sRNA.txt"), header = T, sep = "\t")
meta$Sample_ID <- gsub("BioSample: https://www.ncbi.nlm.nih.gov/biosample/", "", meta$X.Sample_relation) # extract only SAMN IDs from X.Sample_relation
meta
Create Disease, Tissue, and Group (Tissue_Disease) columns
meta$Tissue <- str_replace_all(meta$X.Sample_source_name_ch1, " tissue", "")
meta$Tissue <- str_replace_all(meta$Tissue, "[^A-Za-z0-9]+", "\\.")
meta$Tissue <- str_replace_all(meta$Tissue, "\\.donor\\.[0-9]+", "")
meta$Tissue <- str_replace_all(meta$Tissue, "-", ".")
meta$Tissue <- str_replace_all(meta$Tissue, " ", ".")
meta$Tissue <- tolower(meta$Tissue)
meta$Tissue <- str_replace_all(meta$Tissue, "cb.cd34.lin.", "cb.cd34.lin")
meta$Disease <- "" #all normal
meta$Group <- apply(meta[,c(ncol(meta)-1,ncol(meta))], 1, function(x) paste(x[1], x[2], sep="_"))
meta$Group <- str_replace_all(meta$Group, "_", "")
head(meta)
Grouping: Combine replicates (samples that have the same type)
uniqueTissues <- unique(meta$Group)
meta$Sample_ID %in% names(quintizedData)
[1] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[24] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[47] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[70] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[93] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[116] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[139] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[162] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[185] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
meta <- meta[!(meta$Sample_ID == "SAMN13014236"),] # Sample does not exist in counts file
# Remove columns in quintizedData that does not exist in meta (that means the sample type is not tissue and not cell type)
names(quintizedData) %in% meta$Sample_ID
[1] FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[24] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
[47] FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[70] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE
[93] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE FALSE FALSE TRUE TRUE
[116] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[139] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
[162] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[185] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[208] TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[231] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
[254] TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
[277] TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
[300] TRUE TRUE TRUE TRUE FALSE
quintizedData2 <- quintizedData[,colnames(quintizedData) %in% meta$Sample_ID]
# Add Name and Seq columns
quintizedData2 <- cbind(quintizedData[,1:2], quintizedData2)
#quintizedData2 <- cbind(quintizedData$Seq,quintizedData2)
#quintizedData2 <- cbind(quintizedData$Name,quintizedData2)
#names(quintizedData2)[1] <- "Name"
#names(quintizedData2)[2] <- "Seq"
#names(quintizedData2) %in% meta$Sample_ID
combinedData <- lapply(uniqueTissues, function(x) {
df <- as.data.frame(quintizedData2[,meta$Sample_ID[which(meta$Group == x)]])
return(rowMeans(df))
})
names(combinedData) <- uniqueTissues
combinedData <- do.call(cbind, combinedData)
combinedData <- cbind(quintizedData2[,1:2], combinedData)
dim(combinedData)
[1] 2656 206
Ensure that all the group names appear in the ontology or the corrections file.
Get the ontology and correction files
ont <- read.table(paste(sep="", generaldata_path,"/ontology.txt"), header = F, sep = "\t")
corr <- read.table(paste(sep="", generaldata_path,"/corrections.txt"), header = T, sep = "\t")
head(ont)
Aim to have 0 as a result, meaning all terms are either already in the ontology or the correction file. Or else: update correction file or ontology file, or both.
trms <- unique(unlist(strsplit(names(combinedData)[3:ncol(combinedData)], "_"))) # Splits the composite terms that contain both tissue+disease
if (length(which(trms %in% union(corr[,1], unique(unlist(ont[,c(1,3)]))))) != 0){
trms[-which(trms %in% union(corr[,1], unique(unlist(ont[,c(1,3)]))))] # Which terms aren't in the ontology or the corrected terms
} else {
trms
}
character(0)
Correct current terms that need to be corrected.
colnames(combinedData) <- sapply(colnames(combinedData), function(z) {
y <- strsplit(z, "_")[[1]]
retVal <- sapply(y, function(x) {
if (x %in% corr$currentTerm) {
return(corr$correctedTerm[match(x, corr$currentTerm)])
} else {
return(x)
}
})
return(paste(retVal, collapse = "_"))
})
dim(combinedData)
[1] 2656 206
head(combinedData)
Add the Canonical column and Source column
n<-dim(combinedData)[1]
data<- cbind(combinedData["Name"],combinedData["Seq"],Canonical=rep(T, n) , Source=rep("Lorenzi*", n) , combinedData[,3:dim(combinedData)[2] ])
colnames(data)<- c("Name", "Seq", "Canonical", "Source", colnames(combinedData)[-c(1,2)])
#data <- combinedData # all tissues have to exist in ontology or correction files
#data$Canonical <- T
#data$Source <- "Lorenzi*"
#data <- data[,c(1,2,ncol(data)-1,ncol(data),3:(ncol(data)-2))] # Set columns alignment
colnames(data)
[1] "Name" "Seq"
[3] "Canonical" "Source"
[5] "alveolar.macrophage" "monocyte"
[7] "immature.monocyte.derived.dendritic.cell" "mature.monocyte.derived.dendritic.cell"
[9] "brain.vascular.smooth.muscle.cell" "brain.vascular.adventitial.fibroblast"
[11] "brain.vascular.pericyte" "choroid.plexus.epithelial.cell"
[13] "choroid.plexus.fibroblast" "meningeal.cell"
[15] "neuron" "oligodendrocyte.precursor.cell"
[17] "schwann.cell" "perineurial.cell"
[19] "astrocyte" "cerebellar.astrocyte"
[21] "spinal.cord.astrocyte" "hippocampal.astrocyte"
[23] "astrocytes.brain.stem" "midbrain.astrocyte"
[25] "retinal.astrocyte" "dermal.microvascular.endothelial.cell"
[27] "dermal.lymphatic.endothelial.cell" "keratinocyte"
[29] "keratinocyte" "keratinocyte"
[31] "epidermal.melanocyte" "epidermal.melanocyte"
[33] "epidermal.melanocyte" "epidermal.melanocyte"
[35] "fibroblast.of.dermis" "fibroblast.of.dermis"
[37] "fibroblast.of.dermis" "fibroblast.of.dermis"
[39] "hair.dermal.papilla" "hair.germinal.matrix"
[41] "hair.follicular.sheath.outer.root" "hair.follicular.sheath.inner.root"
[43] "hair.follicular.keratinocyte" "lymphatic.endothelial.cell"
[45] "lymphatic.fibroblast" "tonsil.endothelial.cell"
[47] "tonsil.epithelial.cell" "tonsil.fibroblast"
[49] "oral.keratinocyte" "gingival.fibroblast"
[51] "periodontal.ligament.fibroblast" "esophageal.smooth.muscle.cell"
[53] "esophageal.epithelial.cell" "esophageal.fibroblast"
[55] "gastric.smooth.muscle.cell" "intestinal.smooth.muscle.cell"
[57] "intestinal.fibroblast" "colonic.microvascular.endothelial.cell"
[59] "colonic.smooth.muscle.cell" "colonic.epithelial.cell"
[61] "rectal.smooth.muscle.cell" "pulmonary.microvascular.endothelial.cell"
[63] "pulmonary.artery.endothelial.cell" "pulmonary.artery.smooth.muscle.cell"
[65] "pulmonary.artery.fibroblast" "pulmonary.alveolar.epithelial.cell"
[67] "bronchial.epithelial.cell" "tracheal.epithelial.cell"
[69] "small.airway.epithelial.cell" "fibroblast.of.lung"
[71] "fibroblast.of.lung" "bronchial.smooth.muscle.cell"
[73] "tracheal.smooth.muscle.cell" "skeletal.muscle.cell"
[75] "skeletal.muscle.satellite.cell" "skeletal.muscle.myoblast"
[77] "adrenal.microvascular.endothelium" "adrenal.cortical.cell"
[79] "adrenal.gland.fibroblast" "thyroid.fibroblast"
[81] "pancreatic.stellate.cell" "thymus.fibroblast"
[83] "renal.glomerular.endothelial.cell" "renal.proximal.tubular.epithelial.cell"
[85] "renal.cortical.epithelial.cell" "renal.epithelial.cell"
[87] "renal.mesangial.cell" "bladder.microvascular.endothelial.cell"
[89] "bladder.smooth.muscle.cell" "urothelial.cell"
[91] "prostate.microvascular.endothelial.cell" "prostate.epithelial.cell"
[93] "prostate.fibroblast" "seminal.vesicle.microvascular.endothelial.cell"
[95] "seminal.vesicle.epithelial.cell" "seminal.vesicle.fibroblast"
[97] "calvarial.osteoblast" "femural.osteoblast"
[99] "synoviocyte" "nucleus.pulposus.cell"
[101] "annulus.pulposus.cell" "hepatic.sinusoidal.endothelial.cell"
[103] "hepatocyte" "hepatic.stellate.cell"
[105] "gall.bladder.fibroblast" "splenic.endothelial.cell"
[107] "splenic.fibroblast" "cardiac.microvascular.endothelial.cell"
[109] "coronary.artery.endothelial.cell" "aortic.endothelial.cell"
[111] "aortic.smooth.muscle.cell" "aortic.fibroblast"
[113] "cardiac.muscle.fibre" "cardiac.myocyte"
[115] "cardiac.fibroblast" "cardiac.ventricle.fibroblast"
[117] "cardiac.atrium.fibroblast" "fibroblast.of.cardiac.tissue"
[119] "cardiac.atrium.fibroblast" "pericardial.fibroblast"
[121] "corneal.epithelial.cell" "keratocyte"
[123] "retinal.pigment.epithelial.cell" "lens.epithelial.cell"
[125] "iris.pigment.epithelial.cell" "fibroblast.of.the.conjunctiva"
[127] "non.pigment.ciliary.epithelial.cell" "trabecular.meshwork.cell"
[129] "choroid.fibroblast" "myometrial.microvascular.endothelial.cell"
[131] "endometrial.microvascular.endothelial.cell" "myometrial.smooth.muscle.cell"
[133] "amniotic.epithelial.cell" "villous.trophoblast"
[135] "fibroblast.of.villous.mesenchyme" "amniotic.mesenchymal.stromal.cell"
[137] "chorionic.mesenchymal.stromal.cell" "adipose.microvascular.endothelial.cell"
[139] "preadipocyte.visceral" "subcutaneous.preadipocyte"
[141] "ovarian.microvascular.endothelial.cell" "ovarian.surface.epithelial.cell"
[143] "ovarian.fibroblast" "mesenchymal.stem.cell.of.the.bone.marrow"
[145] "mesenchymal.stem.cell.adipose" "hepatic.mesenchymal.stem.cell"
[147] "mesenchymal.stem.cell.of.umbilical.cord" "pulmonary.mesenchymal.stem.cell"
[149] "vertebral.mesenchymal.stem.cell" "mammary.endothelial.cell"
[151] "mammary.epithelial.cell" "mammary.fibroblast"
[153] "umbilical.vein.endothelial.cell" "umbilical.artery.endothelial.cell"
[155] "umbilical.vein.smooth.muscle.cell" "umbilical.artery.smooth.muscle.cell"
[157] "hematopoietic.stem.cell" "t.lymphocyte.cd3"
[159] "monocyte.cd14" "NK.cells"
[161] "b.lymphocyte.cd19" "brain.microvascular.endothelial.cell"
[163] "articular.chondrocyte" "ileum"
[165] "jejunum" "duodenum"
[167] "right.colon" "distal.colon"
[169] "esophagus" "trachea"
[171] "vena.cava" "pericardium"
[173] "left.atrium" "left.ventricle"
[175] "right.atrium" "right.ventricle"
[177] "oviduct" "thyroid.gland"
[179] "uterus" "lymph.node"
[181] "placenta" "breast"
[183] "pancreas" "adipose.tissue"
[185] "liver" "brain"
[187] "thymus" "heart"
[189] "lung" "spleen"
[191] "testis" "ovary"
[193] "kidney" "skeletal.muscle"
[195] "small.intestine" "colon"
[197] "prostate.gland" "bladder"
[199] "uterine.cervix" "adrenal.gland"
[201] "stomach" "cerebellum"
[203] "brain.stem" "frontal.lobe"
[205] "corpus.striatum" "occipital.lobe"
[207] "parietal.lobe" "brain"
dim(data)
[1] 2656 208
head(data)
NA
Convert to long format
data_long <- melt(data, id.vars=c("Name", "Seq", "Canonical", "Source"))
Binarization
data_long <- data_long[!is.na(data_long$value),]
data_long$Binary <- sapply(data_long$value, function(x) if (x == 0) return(0) else return(1))
names(data_long) <- c("miR", "Seq", "Canonical", "Source", "Tissue", "Scale", "Binary")
write.table(data_long, paste(sep="", result_path,"/lorenzi_longData.txt"), sep = "\t", row.names = F, col.names = T, quote = F)
write.table(unique(data_long[,1:3]), paste(sep="", result_path,"/lorenzi_miRNAs.txt"), sep = "\t", row.names = F, col.names = T, quote = F)
writeLines(as.character(unique(data_long$Tissue)), paste(sep="", result_path,"/lorenzi_tissues.txt"))